In this vignette, I compare healthy and obese snRNA-seq data from human perivascular adipose tissue (PVAT). The two samples are taken from an original collection of three samples. We are only interested in two patients in this analysis:
There are 12,657 and 6,456 cells in samples 1 and 3 respectively that were sequenced on the Illumina NovaSeq 6000. The raw data can be found on the Gene Expression Omnibus website at this link.
The working directory can be set by running the following command in the terminal: setwd("/Users/hannahbazin/Desktop/Cambridge/Academics/Han_Lab/MPhil/mphil-project")
Because pathfindR includes a heuristic search algorithm to identify active subnetworks, we set a random seed for reproducibility.
# Set random seed for reproducibility
set.seed(42)
library(GEOquery)
## Loading required package: Biobase
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
## tapply, union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
library(Seurat)
## Attaching SeuratObject
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:Biobase':
##
## combine
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(patchwork)
library(ggplot2)
library(ggpubr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pathfindR)
## Loading required package: pathfindR.data
## ##############################################################################
## Welcome to pathfindR!
##
## Please cite the article below if you use pathfindR in published reseach:
##
## Ulgen E, Ozisik O, Sezerman OU. 2019. pathfindR: An R Package for Comprehensive
## Identification of Enriched Pathways in Omics Data Through Active Subnetworks.
## Front. Genet. doi:10.3389/fgene.2019.00858
##
## ##############################################################################
For consistency we reload the integrated Seurat object generated previously in the notebook titled human_PVAT_snRNAseq.Rmd.
# Load integrated Seurat object from previous analysis
load(file = "data/analysis/anno_combined.RData")
# Subset to samples 1 and 3
healthy <- subset(anno_combined, subset = sampleType == "GSM5068996")
obese <- subset(anno_combined, subset = sampleType == "GSM5068998")
# Plot UMAP per sample
healthy_umap <- DimPlot(healthy, reduction = "umap") + ggtitle("Healthy Sample")
obese_umap <- DimPlot(obese, reduction = "umap") + ggtitle("Obese Sample")
# Save plots
pdf("results/humanPVATsn/pathfindR/comparison1v3/UMAP_healthy.pdf", width = 12, height = 6)
print(healthy_umap)
invisible(dev.off())
pdf("results/humanPVATsn/pathfindR/comparison1v3/UMAP_obese.pdf", width = 12, height = 6)
print(obese_umap)
invisible(dev.off())
# Show plot
healthy_umap
obese_umap
Subsetting a Seurat object may cause the loss of the scaled data in the object. We verify this and recompute the scaled data for both patients. This is a time-consuming step, so it is best to save the R objects.
# Ensure default assay is RNA assay
DefaultAssay(healthy) <- "RNA"
DefaultAssay(obese) <- "RNA"
# Check that scaled data was preserved - this can be lost while subsetting
head(healthy@assays$RNA@scale.data) # NOT OK
## <0 x 0 matrix>
head(obese@assays$RNA@scale.data) # NOT OK
## <0 x 0 matrix>
# Scale data for each subset separately - if needed
# This is a time-consuming step, save the object
if (file.exists("data/analysis/healthy_scaled.RData")) {
message("Loading existing healthy scaled data file.")
load(file = "data/analysis/healthy_scaled.RData")
} else {
message("Missing healthy scaled data file. Scaling data now.")
healthy_scaled <- ScaleData(healthy, vars.to.regress = "percent.mt")
save(healthy_scaled, file = "data/analysis/healthy_scaled.RData")
}
## Loading existing healthy scaled data file.
if (file.exists("data/analysis/obese_scaled.RData")) {
message("Loading existing obese scaled data file.")
load(file = "data/analysis/obese_scaled.RData")
} else {
message("Missing obese scaled data file. Scaling data now.")
obese_scaled <- ScaleData(obese, vars.to.regress = "percent.mt")
save(obese_scaled, file = "data/analysis/obese_scaled.RData")
}
## Loading existing obese scaled data file.
The FindAllMarkers() function finds markers (i.e. differentially expressed genes) for each of the clusters in a dataset. By default, it uses a Wilcoxon Rank Sum test to identify differentially expressed genes between two groups.Because the function compares one cluster with all other clusters - including non-adipocytes ones, meaning it may identify pathways reflecting differences between adipocytes and non-adipocytes rather than differences between adipocyte developmental stages.
For this reason, we use the FindMarkers function instead. This function finds differentially expressed genes in adipocyte clusters by comparing them only to the three other adipocyte clusters. Seurat will combine all the clusters in ident.2 into a single reference group.
The parameters are set as follows:
- min.pct = 0.3 only tests genes detected in 30% of either of the two populations being compared. This speeds up the function and allows us to focus on more biologically relevant genes.
- logfc.threshold = 0.3 limits testing to genes that show at least 0.3-fold difference (log-scale) between the two groups (removes weaker signals).
# Select clusters in adipocyte lineage
adipo_clusters <- c("Early pre-adipocytes", "Intermediate pre-adipocytes", "Transitional adipocytes", "Mature adipocytes")
healthy_adipo <- subset(healthy_scaled, idents = adipo_clusters)
obese_adipo <- subset(obese_scaled, idents = adipo_clusters)
# Define marker file paths
healthy_marker_files <- paste0("results/humanPVATsn/pathfindR/comparison1v3/healthy_markers_", gsub(" ", "_", tolower(adipo_clusters)), ".csv")
obese_marker_files <- paste0("results/humanPVATsn/pathfindR/comparison1v3/obese_markers_", gsub(" ", "_", tolower(adipo_clusters)), ".csv")
# Function to run FindMarkers for each cluster
run_find_markers <- function(object, condition, marker_files) {
# Identify missing files
missing_files <- marker_files[!file.exists(marker_files)]
if (length(missing_files) > 0) {
message("Missing marker files: ", paste(missing_files, collapse=", "))
}
# Only run FindMarkers if any marker file is missing
if (!all(file.exists(marker_files))) {
for (cluster in adipo_clusters) {
markers <- FindMarkers(
object = object,
ident.1 = cluster,
ident.2 = setdiff(adipo_clusters, cluster),
min.pct = 0.3, # Min fraction of cells expressing the genes
logfc.threshold = 0.3, # Min log fold change
verbose = TRUE
)
# Save CSV
marker_file <- paste0("results/humanPVATsn/pathfindR/comparison1v3/", condition, "_markers_", gsub(" ", "_", tolower(cluster)), ".csv")
write.csv(markers, marker_file, row.names = TRUE)
message(paste0("Saved markers for ", condition, " - ", cluster, " to ", marker_file))
}
} else {
message(paste0("All marker files already exist for ", condition, ". Skipping FindMarkers()."))
}
}
# Ensure default assay is RNA assay
DefaultAssay(healthy_adipo) <- "RNA"
DefaultAssay(obese_adipo) <- "RNA"
# Run FindMarkers for healthy and obese
run_find_markers(healthy_adipo, "healthy", healthy_marker_files)
## All marker files already exist for healthy. Skipping FindMarkers().
run_find_markers(obese_adipo, "obese", obese_marker_files)
## All marker files already exist for obese. Skipping FindMarkers().
We can compute the fraction of cells within the adipocyte lineage part of each differentiation step.
# Function to calculate cluster percentages of adipocyte lineage
get_cluster_percentages <- function(seurat_obj) {
subset_ident <- seurat_obj@active.ident[seurat_obj@active.ident %in% adipo_clusters] # Filter only relevant clusters
total_cells <- length(subset_ident)
percentages <- sapply(adipo_clusters, function(cluster) (sum(subset_ident == cluster) / total_cells) * 100)
return(percentages)
}
# Compute percentages for both Seurat objects
healthy_percentages <- get_cluster_percentages(healthy)
obese_percentages <- get_cluster_percentages(obese)
# Create final dataframe
result_df <- data.frame(
Cluster = adipo_clusters,
Healthy = healthy_percentages,
Obese = obese_percentages
)
# Save as CSV
write.csv(result_df, "results/humanPVATsn/pathfindR/comparison1v3/cluster_percentages.csv", row.names = FALSE)
# Print dataframe
result_df
## Cluster Healthy Obese
## Early pre-adipocytes Early pre-adipocytes 15.777653 6.562974
## Intermediate pre-adipocytes Intermediate pre-adipocytes 39.668725 30.576631
## Transitional adipocytes Transitional adipocytes 5.923638 2.541730
## Mature adipocytes Mature adipocytes 38.629983 60.318665
pathfindR analysispathfindR is an R package designed for active-subnetwork-oriented pathway enrichment analysis. Unlike traditional over-representation analysis (ORA) and gene set enrichment analysis (GSEA), which evaluate gene lists without considering gene interactions, pathfindR integrates protein-protein interaction networks (PINs) to identify active subnetworks – groups of interacting genes enriched in significantly altered genes. A subnetwork is defined as a cluster of interconnected genes in a PIN, and it is considered active if it contains a disproportionately high number of differentially expressed genes. The algorithm then performs pathway enrichment analysis on these subnetworks to identify biologically relevant pathways.
The pathfindR workflow consists of the following steps:
Load marker files for healthy patient.
if (all(file.exists(healthy_marker_files))) {
h_early <- read_csv(healthy_marker_files[1])
h_intermediate <- read_csv(healthy_marker_files[2])
h_transitional <- read_csv(healthy_marker_files[3])
h_mature <- read_csv(healthy_marker_files[4])
} else {
stop("One or more healthy marker files are missing. Run FindMarkers first.")
}
## New names:
## Rows: 1040 Columns: 6
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (5): p_val, avg_log2FC, pct.1, pct.2, p_val_adj
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## New names:
## Rows: 1107 Columns: 6
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (5): p_val, avg_log2FC, pct.1, pct.2, p_val_adj
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## New names:
## Rows: 204 Columns: 6
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (5): p_val, avg_log2FC, pct.1, pct.2, p_val_adj
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## New names:
## Rows: 1530 Columns: 6
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (5): p_val, avg_log2FC, pct.1, pct.2, p_val_adj
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
Load marker files for obese patient.
if (all(file.exists(obese_marker_files))) {
o_early <- read_csv(obese_marker_files[1])
o_intermediate <- read_csv(obese_marker_files[2])
o_transitional <- read_csv(obese_marker_files[3])
o_mature <- read_csv(obese_marker_files[4])
} else {
stop("One or more obese marker files are missing. Run FindMarkers first.")
}
## New names:
## Rows: 1525 Columns: 6
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (5): p_val, avg_log2FC, pct.1, pct.2, p_val_adj
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## New names:
## Rows: 1457 Columns: 6
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (5): p_val, avg_log2FC, pct.1, pct.2, p_val_adj
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## New names:
## Rows: 192 Columns: 6
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (5): p_val, avg_log2FC, pct.1, pct.2, p_val_adj
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## New names:
## Rows: 1507 Columns: 6
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (5): p_val, avg_log2FC, pct.1, pct.2, p_val_adj
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
The data needs to be reformatted to align with the expected input of pathfindR. The input data frame must consist of columns containing: gene symbols, change values (optional) and p values.
# Make function to reformat to pathfindR input
reformat_pathfindR <- function(input) {
input %>%
select(Gene.symbol = ...1, logFC = avg_log2FC, adj.P.Val = p_val_adj) %>%
as.data.frame()
}
# Reformat healthy markers
h_early <- reformat_pathfindR(h_early)
h_intermediate <- reformat_pathfindR(h_intermediate)
h_transitional <- reformat_pathfindR(h_transitional)
h_mature <- reformat_pathfindR(h_mature)
# Reformat obese markers
o_early <- reformat_pathfindR(o_early)
o_intermediate <- reformat_pathfindR(o_intermediate)
o_transitional <- reformat_pathfindR(o_transitional)
o_mature <- reformat_pathfindR(o_mature)
pathfindRFor early pre-adipocytes
h_early_output <- run_pathfindR(h_early,
gene_sets = "GO-BP",
min_gset_size = 5,
max_gset_size = 300
)
## ## Testing input
## The input looks OK
## ## Processing input. Converting gene symbols,
## if necessary (and if human gene symbols provided)
## Number of genes provided in input: 1040
## Number of genes in input after p-value filtering: 976
## pathfindR cannot handle p values < 1e-13. These were changed to 1e-13
##
## Could not find any interactions for 60 (6.15%) genes in the PIN
## Final number of genes in input: 916
## ## Performing Active Subnetwork Search and Enrichment
## ## Processing the enrichment results over all iterations
## ## Annotating involved genes and visualizing enriched terms
## Plotting the enrichment bubble chart
## Found 160 enriched terms
## You may run:
## - cluster_enriched_terms() for clustering enriched terms
## - visualize_terms() for visualizing enriched term diagrams
For intermediate pre-adipocytes
h_inter_output <- run_pathfindR(h_intermediate,
gene_sets = "GO-BP",
min_gset_size = 5,
max_gset_size = 300
)
## ## Testing input
## The input looks OK
## ## Processing input. Converting gene symbols,
## if necessary (and if human gene symbols provided)
## Number of genes provided in input: 1107
## Number of genes in input after p-value filtering: 965
## pathfindR cannot handle p values < 1e-13. These were changed to 1e-13
## Could not find any interactions for 78 (8.08%) genes in the PIN
## Final number of genes in input: 887
## ## Performing Active Subnetwork Search and Enrichment
## ## Processing the enrichment results over all iterations
## ## Annotating involved genes and visualizing enriched terms
## Plotting the enrichment bubble chart
## Found 172 enriched terms
## You may run:
## - cluster_enriched_terms() for clustering enriched terms
## - visualize_terms() for visualizing enriched term diagrams
For transitional adipocytes
h_trans_output <- run_pathfindR(h_transitional,
gene_sets = "GO-BP",
min_gset_size = 5,
max_gset_size = 300
)
## ## Testing input
## The input looks OK
## ## Processing input. Converting gene symbols,
## if necessary (and if human gene symbols provided)
## Number of genes provided in input: 204
## Number of genes in input after p-value filtering: 73
## Could not find any interactions for 9 (12.33%) genes in the PIN
## Final number of genes in input: 64
## ## Performing Active Subnetwork Search and Enrichment
## ## Processing the enrichment results over all iterations
## ## Annotating involved genes and visualizing enriched terms
## Plotting the enrichment bubble chart
## Found 27 enriched terms
## You may run:
## - cluster_enriched_terms() for clustering enriched terms
## - visualize_terms() for visualizing enriched term diagrams
For mature adipocytes
h_mature_output <- run_pathfindR(h_mature,
gene_sets = "GO-BP",
min_gset_size = 5,
max_gset_size = 300
)
## ## Testing input
## The input looks OK
## ## Processing input. Converting gene symbols,
## if necessary (and if human gene symbols provided)
## Number of genes provided in input: 1530
## Number of genes in input after p-value filtering: 1336
## pathfindR cannot handle p values < 1e-13. These were changed to 1e-13
## Could not find any interactions for 110 (8.23%) genes in the PIN
## Final number of genes in input: 1226
## ## Performing Active Subnetwork Search and Enrichment
## ## Processing the enrichment results over all iterations
## ## Annotating involved genes and visualizing enriched terms
## Plotting the enrichment bubble chart
## Found 207 enriched terms
## You may run:
## - cluster_enriched_terms() for clustering enriched terms
## - visualize_terms() for visualizing enriched term diagrams
For early preadipocytes
o_early_output <- run_pathfindR(o_early,
gene_sets = "GO-BP",
min_gset_size = 5,
max_gset_size = 300
)
## ## Testing input
## The input looks OK
## ## Processing input. Converting gene symbols,
## if necessary (and if human gene symbols provided)
## Number of genes provided in input: 1525
## Number of genes in input after p-value filtering: 1016
## pathfindR cannot handle p values < 1e-13. These were changed to 1e-13
## Could not find any interactions for 54 (5.31%) genes in the PIN
## Final number of genes in input: 962
## ## Performing Active Subnetwork Search and Enrichment
## ## Processing the enrichment results over all iterations
## ## Annotating involved genes and visualizing enriched terms
## Plotting the enrichment bubble chart
## Found 158 enriched terms
## You may run:
## - cluster_enriched_terms() for clustering enriched terms
## - visualize_terms() for visualizing enriched term diagrams
For intermediate pre-adipocytes
o_inter_output <- run_pathfindR(o_intermediate,
gene_sets = "GO-BP",
min_gset_size = 5,
max_gset_size = 300
)
## ## Testing input
## The input looks OK
## ## Processing input. Converting gene symbols,
## if necessary (and if human gene symbols provided)
## Number of genes provided in input: 1457
## Number of genes in input after p-value filtering: 1207
## pathfindR cannot handle p values < 1e-13. These were changed to 1e-13
## Could not find any interactions for 76 (6.3%) genes in the PIN
## Final number of genes in input: 1131
## ## Performing Active Subnetwork Search and Enrichment
## ## Processing the enrichment results over all iterations
## ## Annotating involved genes and visualizing enriched terms
## Plotting the enrichment bubble chart
## Found 245 enriched terms
## You may run:
## - cluster_enriched_terms() for clustering enriched terms
## - visualize_terms() for visualizing enriched term diagrams
For transitional adipocytes, the analysis is not possible because the obese patient does not have enough of these cells, therefore no conclusions can be drawn.
For mature adipocytes
o_mature_output <- run_pathfindR(o_mature,
gene_sets = "GO-BP",
min_gset_size = 5,
max_gset_size = 300
)
## ## Testing input
## The input looks OK
## ## Processing input. Converting gene symbols,
## if necessary (and if human gene symbols provided)
## Number of genes provided in input: 1507
## Number of genes in input after p-value filtering: 1318
## pathfindR cannot handle p values < 1e-13. These were changed to 1e-13
## Could not find any interactions for 80 (6.07%) genes in the PIN
## Final number of genes in input: 1238
## ## Performing Active Subnetwork Search and Enrichment
## ## Processing the enrichment results over all iterations
## ## Annotating involved genes and visualizing enriched terms
## Plotting the enrichment bubble chart
## Found 269 enriched terms
## You may run:
## - cluster_enriched_terms() for clustering enriched terms
## - visualize_terms() for visualizing enriched term diagrams
pathfindR resultsThe pathfindR package provides a function to compare two different pathfindR output dataframes. More details can be found in this vignette.
For early-pre-adipocytes
# Combine results
combined_early <- combine_pathfindR_results(
result_A = h_early_output,
result_B = o_early_output
)
## Warning: ggrepel: 222 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## You may run `combined_results_graph()` to create visualizations of combined term-gene graphs of selected terms
For intermediate pre-adipocytes
# Combine results
combined_intermediate <- combine_pathfindR_results(
result_A = h_inter_output,
result_B = o_inter_output
)
## Warning: ggrepel: 433 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## You may run `combined_results_graph()` to create visualizations of combined term-gene graphs of selected terms
For transitional adipocytes, the comparison is not possible because the obese patient does not show a sufficient quantity of transitional adipocyte cells, therefore no conclusions can be drawn.
For mature adipocytes
# Combine results
combined_mature <- combine_pathfindR_results(
result_A = h_mature_output,
result_B = o_mature_output
)
## Warning: ggrepel: 605 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## You may run `combined_results_graph()` to create visualizations of combined term-gene graphs of selected terms
We save the files to streamline the workflow, allowing for quick access when rerunning only the visualisation chunks. In the resulting dataframes, “A only” represents pathways exclusive to the healthy group, while “B only” corresponds to pathways unique to the obese group.
write.csv(combined_early, "results/humanPVATsn/pathfindR/comparison1v3/early_pre_healthy_vs_obese.csv", row.names = TRUE)
write.csv(combined_intermediate, "results/humanPVATsn/pathfindR/comparison1v3/intermediate_pre_healthy_vs_obese.csv", row.names = TRUE)
write.csv(combined_mature, "results/humanPVATsn/pathfindR/comparison1v3/mature_healthy_vs_obese.csv", row.names = TRUE)
Load data and list patient conditions.
# Load data
early_df <- read_csv("results/humanPVATsn/pathfindR/comparison1v3/early_pre_healthy_vs_obese.csv")
## New names:
## Rows: 233 Columns: 19
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): ID, Term_Description, Up_regulated_A, Down_regulated_A, Up_regulat... dbl
## (12): ...1, Fold_Enrichment_A, occurrence_A, support_A, lowest_p_A, high...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
intermediate_df <- read_csv("results/humanPVATsn/pathfindR/comparison1v3/intermediate_pre_healthy_vs_obese.csv")
## New names:
## Rows: 305 Columns: 19
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): ID, Term_Description, Up_regulated_A, Down_regulated_A, Up_regulat... dbl
## (12): ...1, Fold_Enrichment_A, occurrence_A, support_A, lowest_p_A, high...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
mature_df <- read_csv("results/humanPVATsn/pathfindR/comparison1v3/mature_healthy_vs_obese.csv")
## New names:
## Rows: 341 Columns: 19
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): ID, Term_Description, Up_regulated_A, Down_regulated_A, Up_regulat... dbl
## (12): ...1, Fold_Enrichment_A, occurrence_A, support_A, lowest_p_A, high...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# List of conditions
conditions <- c("A only", "B only", "common")
condition_labels <- list("A only" = "Healthy Only", "B only" = "Obese Only", "common" = "Both Healthy and Obese")
Define function to create and save bar plots. - For the A only and B only pathways, we rank them based on fold enrichment. - For the common pathways, we rank them based on the combined p-value; this helps identify pathways that are consistently significant in both datasets.
create_and_save_bar_plot <- function(df, category, title_label, output_file) {
# Define sorting column
sorting_col <- ifelse(category == "common", "combined_p", paste0("Fold_Enrichment_", substr(category, 1, 1)))
# Filter top 20 enriched pathways
if(category == "common") {
df_filtered <- df %>%
filter(status == category) %>%
arrange(combined_p) %>%
slice_head(n = 20)
} else {
df_filtered <- df %>%
filter(status == category) %>%
arrange(desc(.data[[sorting_col]])) %>%
slice_head(n = 20)
}
# Ensure factor levels are correctly ordered for plotting
if(category == "common") {
df_filtered <- df_filtered %>%
mutate(Term_Description = factor(Term_Description,
levels = Term_Description))
} else {
df_filtered <- df_filtered %>%
mutate(Term_Description = factor(Term_Description, levels = rev(Term_Description)))
}
# Create bar plot
plot <- ggplot(df_filtered, aes(x = .data[[sorting_col]], y = Term_Description, fill = Term_Description)) +
geom_bar(stat = "identity", color = "black") +
labs(title = title_label, x = ifelse(category == "common", "Combined P Value", "Fold Enrichment"),
y = "Pathway") +
theme_minimal() +
theme(legend.position = "none")
# Save plot
ggsave(output_file, plot, width = 12, height = 6, device = "pdf")
# Return plot
return(plot)
}
Generate plots for each differentiation stage: early pre-adipocytes, intermediate pre-adipocytes, and mature adipocytes.
for (condition in conditions) {
# Early adipocytes
early_plot <- create_and_save_bar_plot(early_df, condition, paste("Early Pre-adipocytes: Enriched Pathways for", condition_labels[[condition]]), paste0("results/humanPVATsn/pathfindR/comparison1v3/early_", gsub(" ", "_", condition), ".pdf"))
print(early_plot)
# Intermediate adipocytes
inter_plot <- create_and_save_bar_plot(intermediate_df, condition, paste("Intermediate Pre-adipocytes: Enriched Pathways for", condition_labels[[condition]]), paste0("results/humanPVATsn/pathfindR/comparison1v3/intermediate_", gsub(" ", "_", condition), ".pdf"))
print(inter_plot)
# Mature adipocytes
mature_plot <- create_and_save_bar_plot(mature_df, condition, paste("Mature Adipocytes: Enriched Pathways for", condition_labels[[condition]]), paste0("results/humanPVATsn/pathfindR/comparison1v3/mature_", gsub(" ", "_", condition), ".pdf"))
print(mature_plot)
}
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sonoma 14.0
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/London
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] pathfindR_2.4.1 pathfindR.data_2.1.0 lubridate_1.9.4
## [4] forcats_1.0.0 stringr_1.5.1 purrr_1.0.2
## [7] readr_2.1.5 tidyr_1.3.1 tibble_3.2.1
## [10] tidyverse_2.0.0 ggpubr_0.6.0.999 ggplot2_3.5.1
## [13] patchwork_1.3.0 dplyr_1.1.4 SeuratObject_4.1.4
## [16] Seurat_4.4.0 GEOquery_2.72.0 Biobase_2.64.0
## [19] BiocGenerics_0.50.0 BiocStyle_2.32.1
##
## loaded via a namespace (and not attached):
## [1] RcppAnnoy_0.0.22 splines_4.4.1 later_1.4.1
## [4] polyclip_1.10-7 lifecycle_1.0.4 rstatix_0.7.2
## [7] doParallel_1.0.17 globals_0.16.3 lattice_0.22-6
## [10] vroom_1.6.5 MASS_7.3-64 backports_1.5.0
## [13] magrittr_2.0.3 limma_3.60.6 plotly_4.10.4
## [16] sass_0.4.9 rmarkdown_2.29 jquerylib_0.1.4
## [19] yaml_2.3.10 httpuv_1.6.15 sctransform_0.4.1
## [22] sp_2.2-0 spatstat.sparse_3.1-0 reticulate_1.40.0
## [25] DBI_1.2.3 cowplot_1.1.3 pbapply_1.7-2
## [28] RColorBrewer_1.1-3 zlibbioc_1.50.0 abind_1.4-8
## [31] Rtsne_0.17 ggraph_2.2.1 tweenr_2.0.3
## [34] GenomeInfoDbData_1.2.12 IRanges_2.38.1 S4Vectors_0.42.1
## [37] ggrepel_0.9.6 irlba_2.3.5.1 listenv_0.9.1
## [40] spatstat.utils_3.1-2 goftest_1.2-3 spatstat.random_3.3-2
## [43] fitdistrplus_1.2-2 parallelly_1.42.0 leiden_0.4.3.1
## [46] codetools_0.2-20 xml2_1.3.6 ggforce_0.4.2
## [49] tidyselect_1.2.1 UCSC.utils_1.0.0 farver_2.1.2
## [52] viridis_0.6.5 stats4_4.4.1 matrixStats_1.5.0
## [55] spatstat.explore_3.3-4 jsonlite_1.8.9 tidygraph_1.3.1
## [58] progressr_0.15.1 Formula_1.2-5 ggridges_0.5.6
## [61] survival_3.8-3 iterators_1.0.14 systemfonts_1.2.1
## [64] foreach_1.5.2 tools_4.4.1 ragg_1.3.3
## [67] ica_1.0-3 Rcpp_1.0.14 glue_1.8.0
## [70] gridExtra_2.3 xfun_0.50 GenomeInfoDb_1.40.1
## [73] withr_3.0.2 BiocManager_1.30.25 fastmap_1.2.0
## [76] digest_0.6.37 timechange_0.3.0 R6_2.5.1
## [79] mime_0.12 textshaping_1.0.0 colorspace_2.1-1
## [82] scattermore_1.2 tensor_1.5 RSQLite_2.3.9
## [85] spatstat.data_3.1-4 generics_0.1.3 data.table_1.16.4
## [88] graphlayouts_1.2.2 httr_1.4.7 htmlwidgets_1.6.4
## [91] uwot_0.2.2 pkgconfig_2.0.3 gtable_0.3.6
## [94] blob_1.2.4 lmtest_0.9-40 XVector_0.44.0
## [97] htmltools_0.5.8.1 carData_3.0-5 bookdown_0.42
## [100] scales_1.3.0 png_0.1-8 spatstat.univar_3.1-1
## [103] knitr_1.49 rstudioapi_0.17.1 tzdb_0.4.0
## [106] reshape2_1.4.4 nlme_3.1-167 org.Hs.eg.db_3.20.0
## [109] cachem_1.1.0 zoo_1.8-12 KernSmooth_2.23-26
## [112] parallel_4.4.1 miniUI_0.1.1.1 AnnotationDbi_1.68.0
## [115] pillar_1.10.1 grid_4.4.1 vctrs_0.6.5
## [118] RANN_2.6.2 promises_1.3.2 car_3.1-3
## [121] xtable_1.8-4 cluster_2.1.8 evaluate_1.0.3
## [124] tinytex_0.54 magick_2.8.5 cli_3.6.3
## [127] compiler_4.4.1 crayon_1.5.3 rlang_1.1.5
## [130] future.apply_1.11.3 ggsignif_0.6.4 labeling_0.4.3
## [133] plyr_1.8.9 stringi_1.8.4 viridisLite_0.4.2
## [136] deldir_2.0-4 Biostrings_2.72.1 munsell_0.5.1
## [139] lazyeval_0.2.2 spatstat.geom_3.3-5 Matrix_1.7-2
## [142] hms_1.1.3 bit64_4.6.0-1 future_1.34.0
## [145] KEGGREST_1.44.1 statmod_1.5.0 shiny_1.10.0
## [148] ROCR_1.0-11 igraph_2.1.4 broom_1.0.7
## [151] memoise_2.0.1 bslib_0.9.0 bit_4.5.0.1